Parkinson’s Disease (PD) is a degenerative neurological disorder marked by decreased dopamine levels in the brain. It manifests itself through a deterioration of movement, including the presence of tremors and stiffness. There is commonly a marked effect on speech, including dysarthria (difficulty articulating sounds), hypophonia (lowered volume), and monotone (reduced pitch range). Additionally, cognitive impairments and changes in mood can occur, and risk of dementia is increased.
Traditional diagnosis of Parkinson’s Disease involves a clinician taking a neurological history of the patient and observing motor skills in various situations. Since there is no definitive laboratory test to diagnose PD, diagnosis is often difficult, particularly in the early stages when motor effects are not yet severe. Monitoring progression of the disease over time requires repeated clinic visits by the patient. An effective screening process, particularly one that doesn’t require a clinic visit, would be beneficial. Since PD patients exhibit characteristic vocal features, voice recordings are a useful and non-invasive tool for diagnosis. If machine learning algorithms could be applied to a voice recording dataset to accurately diagnose PD, this would be an effective screening step prior to an appointment with a clinician.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score,roc_curve,auc
# calculate accuracy measures and confusion matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Load the UCI Parkinson's voice-measurement dataset and preview it.
DATA_PATH = 'parkinsons.data'
parkinsons = pd.read_csv(DATA_PATH)
parkinsons.head()
Target column rearrange: since our target column (status) sits in the middle of the dataframe, for convenience it is removed from its original position and appended at the end of the dataframe.
# Move the target column 'status' to the last position so that the
# features come first and the label comes last.
status_col = parkinsons.pop('status')   # remove and capture the column
parkinsons['status'] = status_col       # re-append it at the end
parkinsons.head()
# Basic structure of the dataframe: dimensions, columns and dtypes.
n_rows, n_cols = parkinsons.shape
print('Total Number of rows :', n_rows)
print('Total Number of columns :', n_cols)
parkinsons.columns
parkinsons.dtypes
parkinsons.info()
# Null-value audit: per-column counts plus a single overall flag.
parkinsons.isnull().sum()
parkinsons.isnull().values.any()
# Number of distinct values in every column.
parkinsons.apply(lambda col: len(col.unique()))
# Summary statistics, transposed so each feature is one row.
parkinsons.describe().transpose()
# Central tendency and spread of each column.
parkinsons.mean()
parkinsons.median()
parkinsons.std()
# Pairwise scatter matrix of all features, coloured by the target class.
sns.pairplot(parkinsons, hue="status", palette="husl")
# Skewness of each column (NaNs ignored).
parkinsons.skew(axis=0, skipna=True)
# Distribution of every column: one histogram+KDE per subplot, filled
# row-major into a 6x4 grid (the last cell stays empty).
sns.set_style("whitegrid")
fig, axes = plt.subplots(nrows=6, ncols=4, figsize=(20, 25))
dist_specs = [
    ('MDVP:Fo(Hz)', 'teal'), ('MDVP:Fhi(Hz)', 'darkturquoise'),
    ('MDVP:Flo(Hz)', 'cadetblue'), ('MDVP:Jitter(%)', 'lightseagreen'),
    ('MDVP:Jitter(Abs)', 'darkcyan'), ('MDVP:RAP', 'green'),
    ('MDVP:PPQ', 'seagreen'), ('Jitter:DDP', 'darkorchid'),
    ('MDVP:Shimmer', 'darkblue'), ('MDVP:Shimmer(dB)', 'blue'),
    ('Shimmer:APQ3', 'royalblue'), ('Shimmer:APQ5', 'darkblue'),
    ('MDVP:APQ', 'yellowgreen'), ('Shimmer:DDA', 'navy'),
    ('NHR', 'darkcyan'), ('HNR', 'chartreuse'),
    ('status', 'purple'), ('RPDE', 'slateblue'),
    ('DFA', 'springgreen'), ('spread1', 'orange'),
    ('spread2', 'darkorange'), ('D2', 'olive'), ('PPE', 'indigo'),
]
for idx, (col, colour) in enumerate(dist_specs):
    sns.distplot(parkinsons[col], ax=axes[idx // 4, idx % 4], color=colour)
# Box plot of every column to eyeball outliers; palettes alternate
# between "YlGnBu" (even positions) and "husl" (odd positions) exactly
# as in the per-column plots above.
sns.set_style("whitegrid")
fig, axes = plt.subplots(nrows=6, ncols=4, figsize=(20, 25))
box_cols = [
    'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
    'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
    'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
    'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR',
    'status', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE',
]
for idx, col in enumerate(box_cols):
    palette = "YlGnBu" if idx % 2 == 0 else "husl"
    sns.boxplot(parkinsons[col], ax=axes[idx // 4, idx % 4], palette=palette)
## check class balance
# Bar chart of the target: the classes are imbalanced — 147 of the
# recordings belong to Parkinson's patients (status == 1).
sns.countplot(x='status', data=parkinsons, palette="husl")
parkinsons.status.value_counts()
# Pairwise Pearson correlation between attributes, shown as a heatmap.
cor = parkinsons.corr()
cor
plt.figure(figsize=(20, 22))
plt.title('Correlation of Attributes', y=1.05, size=19)
sns.heatmap(cor, annot=True, linewidths=.5, cmap="YlGnBu")
# Per-class box plots: each acoustic feature split by the target
# 'status'.  Explicit (column, row, col-index) specs preserve the
# original subplot layout, including the unused cells in rows 4-5.
sns.set_style("whitegrid")
fig, axes = plt.subplots(nrows=6, ncols=4, figsize=(20, 25))
specs = [
    ('MDVP:Fo(Hz)', 0, 0), ('MDVP:Fhi(Hz)', 0, 1), ('MDVP:Flo(Hz)', 0, 2),
    ('MDVP:Jitter(%)', 0, 3), ('MDVP:Jitter(Abs)', 1, 0), ('MDVP:RAP', 1, 1),
    ('MDVP:PPQ', 1, 2), ('Jitter:DDP', 1, 3), ('MDVP:Shimmer', 2, 0),
    ('MDVP:Shimmer(dB)', 2, 1), ('Shimmer:APQ3', 2, 2), ('Shimmer:APQ5', 2, 3),
    ('MDVP:APQ', 3, 0), ('Shimmer:DDA', 3, 1), ('NHR', 3, 2), ('HNR', 3, 3),
    ('RPDE', 4, 0), ('DFA', 4, 1), ('spread1', 4, 2), ('spread2', 4, 3),
    ('D2', 5, 1), ('PPE', 5, 2),
]
for col, r, c in specs:
    sns.boxplot(y=parkinsons[col], x=parkinsons['status'],
                ax=axes[r, c], palette="husl")
# Pie chart of the class split (Parkinson's vs healthy recordings).
park_counts = pd.DataFrame(parkinsons["status"].value_counts()).reset_index()
park_counts.columns = ["Labels", "status"]
park_counts
fig1, ax1 = plt.subplots(figsize=(6, 6))
ax1.pie(park_counts["status"],
        explode=(0, 0.30),            # pull the smaller wedge out
        labels=park_counts["Labels"],
        autopct='%1.1f%%',
        shadow=True,
        startangle=70)
ax1.axis('equal')                     # equal aspect keeps the pie circular
plt.title("Parkinson's Disease Percentage")
plt.show()
parkinsons.head(1)
# 'name' is a per-recording identifier and carries no predictive signal.
park = parkinsons.drop(['name'], axis=1)
park.shape
park.head(1)
# Relative class frequencies of the target.
park["status"].value_counts(normalize=True)
# Split data into train and test sets (70 : 30).
# X holds the 22 acoustic features; y is the binary label.
feature_cols = ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
                'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
                'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3',
                'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR',
                'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
X = park.loc[:, feature_cols]
y = park['status']
# NOTE(review): no random_state is given, so every rerun produces a
# different partition — results below are not reproducible; consider
# adding random_state=... and stratify=y for the imbalanced classes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape
X_test.shape
# Missing-value guard.
# FIX: the original referenced an undefined variable `loan`
# (copy-pasted from a loan-approval notebook) and would raise a
# NameError if any NaN were present; it must operate on `parkinsons`.
if parkinsons.isnull().values.any():
    print("Missing values present : ", parkinsons.isnull().values.sum())
    parkinsons = parkinsons.dropna()
else:
    print("No missing values present")
#Missing value Visualization — dark cells mark NaNs (none expected here)
sns.heatmap(parkinsons.isna(), yticklabels=False, cbar=False, cmap="rocket_r")
# ---------------- Logistic Regression ----------------
# Build and fit a Logistic Regression model on the 70% training split.
# FIX: the original fitted an identical throwaway model (`clf`) first —
# redundant duplicate training removed.
lrg_model = LogisticRegression()
lrg_model.fit(X_train, y_train)
# predict the response on the held-out test set
lrg_y_predict = lrg_model.predict(X_test)
# evaluate accuracy
lrg_score = lrg_model.score(X_test, y_test)
# FIX: comment wrongly said "Naïve Bayes" — this is the Logistic
# Regression model's accuracy.
lrg_accuracy = accuracy_score(y_test, lrg_y_predict)
lrg_confusion_matrix = metrics.confusion_matrix(y_test, lrg_y_predict)
# train vs test accuracy (a large gap would suggest overfitting)
print('Accuracy of Training Data : ', lrg_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', lrg_model.score(X_test, y_test))
print('----------------------Final Analysis of Logistic Regression----------------------------\n')
print('Logistic Regression Model Accuracy Score : %f' % lrg_score)
print('\nLogistic Regression Confusion Matrix : \n', lrg_confusion_matrix)
# confusion_matrix orders classes 0 (healthy) then 1 (Parkinson's), so
# [1][1] counts true positives for the Parkinson's class.
print('\nTrue Possitive = ', lrg_confusion_matrix[1][1])
print('True Negative = ', lrg_confusion_matrix[0][0])
print('False Possitive = ', lrg_confusion_matrix[0][1])
print('False Negative = ', lrg_confusion_matrix[1][0])
print('\n Logistic Regression classification Report : \n', metrics.classification_report(y_test, lrg_y_predict))
# Derive accuracy / precision / recall / F1 by hand from the table.
lrg_conf_table = lrg_confusion_matrix
a1 = (lrg_conf_table[0, 0] + lrg_conf_table[1, 1]) / (lrg_conf_table[0, 0] + lrg_conf_table[0, 1] + lrg_conf_table[1, 0] + lrg_conf_table[1, 1])
p1 = lrg_conf_table[1, 1] / (lrg_conf_table[1, 1] + lrg_conf_table[0, 1])
r1 = lrg_conf_table[1, 1] / (lrg_conf_table[1, 1] + lrg_conf_table[1, 0])
f1 = (2 * p1 * r1) / (p1 + r1)
print("\nAccuracy : ", round(a1, 2))
print("Precision : ", round(p1, 2))
print("Recall : ", round(r1, 2))
print("F1 score : ", round(f1, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(lrg_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: class 0 is Healthy and class 1 is Parkinson's (UCI dataset:
# status 1 = PD); the original label order was swapped, mislabelling
# both axes of the plot.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix Logistic Regression Model - Test Data")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(lrg_confusion_matrix[i][j]))
plt.show()
# ---------------- Gaussian Naïve Bayes ----------------
# Build and fit a Gaussian Naïve Bayes classifier on the training split.
# FIX: redundant duplicate model (`clf1`) removed — it was fitted and
# never used.
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
# predict the response on the held-out test set
gnb_y_predict = gnb_model.predict(X_test)
# evaluate accuracy
gnb_score = gnb_model.score(X_test, y_test)
gnb_accuracy = accuracy_score(y_test, gnb_y_predict)  # Accuracy of Naïve Bayes Classifier model
gnb_confusion_matrix = metrics.confusion_matrix(y_test, gnb_y_predict)
# train vs test accuracy
print('Accuracy of Training Data : ', gnb_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', gnb_model.score(X_test, y_test))
print('----------------------Final Analysis of Naïve Bayes----------------------------\n')
print('Naïve Bayes Model Accuracy Score: %f' % gnb_accuracy)
print('\nNaïve Bayes Confusion Matrix: \n', gnb_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', gnb_confusion_matrix[1][1])
print('True Negative = ', gnb_confusion_matrix[0][0])
print('False Possitive = ', gnb_confusion_matrix[0][1])
print('False Negative = ', gnb_confusion_matrix[1][0])
print('\n Gaussian Naïve Bayes classification Report : \n', metrics.classification_report(y_test, gnb_y_predict))
# Hand-derived accuracy / precision / recall / F1.
gnb_conf_table = gnb_confusion_matrix
a2 = (gnb_conf_table[0, 0] + gnb_conf_table[1, 1]) / (gnb_conf_table[0, 0] + gnb_conf_table[0, 1] + gnb_conf_table[1, 0] + gnb_conf_table[1, 1])
p2 = gnb_conf_table[1, 1] / (gnb_conf_table[1, 1] + gnb_conf_table[0, 1])
r2 = gnb_conf_table[1, 1] / (gnb_conf_table[1, 1] + gnb_conf_table[1, 0])
f2 = (2 * p2 * r2) / (p2 + r2)
print("\nAccuracy : ", round(a2, 2))
print("Precision : ", round(p2, 2))
print("Recall : ", round(r2, 2))
print("F1 score : ", round(f2, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(gnb_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: label order was swapped — class 0 is Healthy, class 1 Parkinson's.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix Naïve Bayes Model - Test Data")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(gnb_confusion_matrix[i][j]))
plt.show()
# ---------------- K-NN: choosing k ----------------
# FIX: the original cell (a) fitted a throwaway `clf2` model,
# (b) computed a standardized copy of the data (X_std) — including the
# target column — and then never used it, (c) manually repeated the
# fit/predict/score sequence for k = 1, 3, 5, 7 even though the loop
# below covers those values, and (d) re-imported matplotlib.  The loop
# now prints the accuracy for every odd k once.
# NOTE(review): K-NN runs on *unscaled* features here; standardizing
# X_train/X_test before fitting would likely help — confirm and apply
# consistently if desired.
neighbors = list(range(1, 20, 2))   # odd k values 1, 3, ..., 19
ac_scores = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # predict the response and record the accuracy for this k
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    ac_scores.append(acc)
    print('Accuracy when k = %d : ' % k, acc)
# changing to misclassification error
MSE = [1 - x for x in ac_scores]
# determining best k (smallest misclassification error)
optimal_k = neighbors[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE, color='darkmagenta')
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()
# Building the final K-NN classifier with k = optimal_k (chosen above).
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)
knn_model.fit(X_train, y_train)
# predict the response on the held-out test set
knn_y_predict = knn_model.predict(X_test)
# evaluate accuracy
knn_score = knn_model.score(X_test, y_test)
knn_accuracy = accuracy_score(y_test, knn_y_predict)  # Accuracy of K-NN model
knn_confusion_matrix = metrics.confusion_matrix(y_test, knn_y_predict)
# train vs test accuracy
print('Accuracy of Training Data : ', knn_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', knn_model.score(X_test, y_test))
print('----------------------Final Analysis of K-NN----------------------------\n')
# FIX: the original output said "without Experience" — a leftover label
# from a different (loan) notebook with an 'Experience' feature.
print('K-NN Model Accuracy Score : %f' % knn_accuracy)
print('\nK-NN Confusion Matrix : \n', knn_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', knn_confusion_matrix[1][1])
print('True Negative = ', knn_confusion_matrix[0][0])
print('False Possitive = ', knn_confusion_matrix[0][1])
print('False Negative = ', knn_confusion_matrix[1][0])
print('\nK-NN classification Report : \n', metrics.classification_report(y_test, knn_y_predict))
# Hand-derived accuracy / precision / recall / F1.
knn_conf_table = knn_confusion_matrix
a3 = (knn_conf_table[0, 0] + knn_conf_table[1, 1]) / (knn_conf_table[0, 0] + knn_conf_table[0, 1] + knn_conf_table[1, 0] + knn_conf_table[1, 1])
p3 = knn_conf_table[1, 1] / (knn_conf_table[1, 1] + knn_conf_table[0, 1])
r3 = knn_conf_table[1, 1] / (knn_conf_table[1, 1] + knn_conf_table[1, 0])
f3 = (2 * p3 * r3) / (p3 + r3)
print("\nAccuracy : ", round(a3, 2))
print("Precision : ", round(p3, 2))
print("Recall : ", round(r3, 2))
print("F1 score : ", round(f3, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(knn_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: label order was swapped — class 0 is Healthy, class 1 Parkinson's.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix K-NN Model - Test Data")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(knn_confusion_matrix[i][j]))
plt.show()
# ---------------- Support Vector Machine ----------------
# Build and fit a linear-kernel SVM on the training split.
# FIX: redundant duplicate model (`clf3`) removed — it was fitted and
# never used.
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)
# predict the response on the held-out test set
svm_y_predict = svc_model.predict(X_test)
# evaluate accuracy
svm_score = svc_model.score(X_test, y_test)
svm_accuracy = accuracy_score(y_test, svm_y_predict)  # Accuracy of SVM model
svm_confusion_matrix = metrics.confusion_matrix(y_test, svm_y_predict)
# train vs test accuracy
print('Accuracy of Training Data : ', svc_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', svc_model.score(X_test, y_test))
print('----------------------Final Analysis of SVM----------------------------\n')
print('SVM Model Accuracy Score : %f' % svm_accuracy)
print('\nSVM Confusion Matrix : \n', svm_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', svm_confusion_matrix[1][1])
print('True Negative = ', svm_confusion_matrix[0][0])
print('False Possitive = ', svm_confusion_matrix[0][1])
print('False Negative = ', svm_confusion_matrix[1][0])
print('\nSVM classification Report : \n', metrics.classification_report(y_test, svm_y_predict))
# Hand-derived accuracy / precision / recall / F1.
svm_conf_table = svm_confusion_matrix
a4 = (svm_conf_table[0, 0] + svm_conf_table[1, 1]) / (svm_conf_table[0, 0] + svm_conf_table[0, 1] + svm_conf_table[1, 0] + svm_conf_table[1, 1])
p4 = svm_conf_table[1, 1] / (svm_conf_table[1, 1] + svm_conf_table[0, 1])
r4 = svm_conf_table[1, 1] / (svm_conf_table[1, 1] + svm_conf_table[1, 0])
f4 = (2 * p4 * r4) / (p4 + r4)
print("\nAccuracy : ", round(a4, 2))
print("Precision : ", round(p4, 2))
print("Recall : ", round(r4, 2))
print("F1 score : ", round(f4, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(svm_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: label order was swapped — class 0 is Healthy, class 1 Parkinson's.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix SVM Model - Test Data")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(svm_confusion_matrix[i][j]))
plt.show()
# ---------------- Decision Tree (entropy) ----------------
# Build and fit an entropy-criterion decision tree.
# FIX: redundant duplicate model (`clf4`) removed; random_state=1 added
# so results are reproducible and consistent with the Gini tree below.
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt_model.fit(X_train, y_train)
# predict the response on the held-out test set
dt_y_predict = dt_model.predict(X_test)
# evaluate accuracy
dt_score = dt_model.score(X_test, y_test)
dt_accuracy = accuracy_score(y_test, dt_y_predict)  # Accuracy of Decision Tree model
dt_confusion_matrix = metrics.confusion_matrix(y_test, dt_y_predict)
# train vs test accuracy (unpruned trees usually score 1.0 on train)
print('Accuracy of Training Data : ', dt_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', dt_model.score(X_test, y_test))
print('----------------------Final Analysis of Decision Tree Classifier----------------------------\n')
print('Decision Tree Classifier (Entropy) Model Accuracy Score : %f' % dt_accuracy)
print('\nDecision Tree Classifier Confusion Matrix : \n', dt_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', dt_confusion_matrix[1][1])
print('True Negative = ', dt_confusion_matrix[0][0])
print('False Possitive = ', dt_confusion_matrix[0][1])
print('False Negative = ', dt_confusion_matrix[1][0])
print('\nDecision Tree Classifier Report : \n', metrics.classification_report(y_test, dt_y_predict))
# Hand-derived accuracy / precision / recall / F1.
dt_conf_table = dt_confusion_matrix
a5 = (dt_conf_table[0, 0] + dt_conf_table[1, 1]) / (dt_conf_table[0, 0] + dt_conf_table[0, 1] + dt_conf_table[1, 0] + dt_conf_table[1, 1])
p5 = dt_conf_table[1, 1] / (dt_conf_table[1, 1] + dt_conf_table[0, 1])
r5 = dt_conf_table[1, 1] / (dt_conf_table[1, 1] + dt_conf_table[1, 0])
f5 = (2 * p5 * r5) / (p5 + r5)
print("\nAccuracy : ", round(a5, 2))
print("Precision : ", round(p5, 2))
print("Recall : ", round(r5, 2))
print("F1 score : ", round(f5, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(dt_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: label order was swapped — class 0 is Healthy, class 1 Parkinson's.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix DTree Classifier (Entropy) Model - Test Data")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(dt_confusion_matrix[i][j]))
plt.show()
# Export the fitted entropy tree to Graphviz .dot and render it to PNG.
from sklearn.tree import export_graphviz
train_char_label = ['No', 'Yes']
# FIX: use a context manager so the .dot file is closed even if
# export_graphviz raises (the original left the handle open on error).
with open('Parkinsons_tree.dot', 'w') as park_tree_file:
    export_graphviz(dt_model, out_file=park_tree_file,
                    feature_names=list(X_train),
                    class_names=list(train_char_label))
from os import system
from IPython.display import Image
# Works only if the Graphviz "dot" binary is available on this machine.
retCode = system("dot -Tpng Parkinsons_tree.dot -o Parkinsons_tree.png")
if retCode > 0:
    print("system command returning error: " + str(retCode))
else:
    display(Image("Parkinsons_tree.png"))
# ---------------- Decision Tree (Gini) ----------------
# Build and fit a Gini-criterion decision tree (seeded for
# reproducibility, as in the original).
dt1_model = DecisionTreeClassifier(criterion='gini', random_state=1)
dt1_model.fit(X_train, y_train)
# predict the response on the held-out test set
dt1_y_predict = dt1_model.predict(X_test)
# evaluate accuracy
dt1_score = dt1_model.score(X_test, y_test)
dt1_accuracy = accuracy_score(y_test, dt1_y_predict)  # Accuracy of Decision Tree model
dt1_confusion_matrix = metrics.confusion_matrix(y_test, dt1_y_predict)
# train vs test accuracy
print('Accuracy of Training Data : ', dt1_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', dt1_model.score(X_test, y_test))
print('----------------------Final Analysis of Decision Tree Classifier----------------------------\n')
print('Decision Tree Classifier (Gini) Model Accuracy Score : %f' % dt1_accuracy)
print('\nDecision Tree Classifier Confusion Matrix : \n', dt1_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', dt1_confusion_matrix[1][1])
print('True Negative = ', dt1_confusion_matrix[0][0])
print('False Possitive = ', dt1_confusion_matrix[0][1])
print('False Negative = ', dt1_confusion_matrix[1][0])
print('\nDecision Tree Classifier Report : \n', metrics.classification_report(y_test, dt1_y_predict))
# Hand-derived accuracy / precision / recall / F1.
dt1_conf_table = dt1_confusion_matrix
a6 = (dt1_conf_table[0, 0] + dt1_conf_table[1, 1]) / (dt1_conf_table[0, 0] + dt1_conf_table[0, 1] + dt1_conf_table[1, 0] + dt1_conf_table[1, 1])
p6 = dt1_conf_table[1, 1] / (dt1_conf_table[1, 1] + dt1_conf_table[0, 1])
r6 = dt1_conf_table[1, 1] / (dt1_conf_table[1, 1] + dt1_conf_table[1, 0])
f6 = (2 * p6 * r6) / (p6 + r6)
print("\nAccuracy : ", round(a6, 2))
print("Precision : ", round(p6, 2))
print("Recall : ", round(r6, 2))
print("F1 score : ", round(f6, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(dt1_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: label order was swapped — class 0 is Healthy, class 1 Parkinson's.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix DTree Classifier (Gini) Model - Test Data")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(dt1_confusion_matrix[i][j]))
plt.show()
# Export the fitted Gini tree to Graphviz .dot and render it to PNG.
from sklearn.tree import export_graphviz
train_char_label = ['No', 'Yes']
# FIX: context manager guarantees the .dot file is closed on error.
with open('Parkinsons1_tree.dot', 'w') as park1_tree_file:
    export_graphviz(dt1_model, out_file=park1_tree_file,
                    feature_names=list(X_train),
                    class_names=list(train_char_label))
from os import system
from IPython.display import Image
# Works only if the Graphviz "dot" binary is available on this machine.
retCode = system("dot -Tpng Parkinsons1_tree.dot -o Parkinsons1_tree.png")
if retCode > 0:
    print("system command returning error: " + str(retCode))
else:
    display(Image("Parkinsons1_tree.png"))
# compare ensemble to each baseline classifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import StackingClassifier
from matplotlib import pyplot
# get the dataset
def get_dataset():
    """Return the 70% training split defined earlier in the notebook.

    FIX: the original body's first line was the bare expression
    `X_train, y_train` — a no-op statement; only the return is needed.
    """
    return X_train, y_train
# get a stacking ensemble of models
def get_stacking():
    """Build a stacking ensemble: the six classifiers studied above as
    base learners, a logistic regression meta-learner, 6-fold internal CV."""
    base_learners = [
        ('Logistic Regression', LogisticRegression()),
        ('K - NN', KNeighborsClassifier()),
        ('SVM', SVC()),
        ('Naïve Bayes', GaussianNB()),
        ('Decision Tree - Entopy', DecisionTreeClassifier(criterion='entropy')),
        ('Decision Tree - Gini', DecisionTreeClassifier(criterion='gini')),
    ]
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners,
                              final_estimator=meta_learner, cv=6)
# get a list of models to evaluate
def get_models():
    """Map display name -> unfitted estimator for every model compared."""
    return {
        'Logistic Regression': LogisticRegression(),
        'K-NN': KNeighborsClassifier(),
        'SVM': SVC(),
        'Naïve Bayes': GaussianNB(),
        'Decision Tree - Entropy': DecisionTreeClassifier(criterion='entropy'),
        'Decision Tree - Gini': DecisionTreeClassifier(criterion='gini'),
        'Stacking': get_stacking(),
    }
# evaluate a given model using cross-validation
def evaluate_model(model, X_train, y_train):
    """Score `model` with 10-fold stratified CV repeated 3 times;
    returns the array of 30 accuracy scores."""
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X_train, y_train, scoring='accuracy',
                           cv=splitter, n_jobs=-1, error_score='raise')
# define dataset
X, y = get_dataset()
# evaluate every candidate model on the training split and collect scores
models = get_models()
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# box plot comparing the CV accuracy distribution of each model
plt.figure(figsize=(20, 15))
plt.title('Comparison of Various Models', y=1.05, size=19)
box = pyplot.boxplot(results, labels=names, showmeans=True, patch_artist=True)
palette = ['seagreen', 'orange', 'olive', 'purple', 'red', 'yellow', 'lightblue']
for patch, colour in zip(box['boxes'], palette):
    patch.set_facecolor(colour)
pyplot.show()
# evaluate the same models on the held-out 30% split
# NOTE(review): this runs repeated CV *within* the test partition, which
# is an unusual protocol (models are refit on test-set folds); treat the
# numbers as indicative only.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X_test, y_test)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# box plot comparing the models on the test split
plt.figure(figsize=(20, 15))
plt.title('Comparison of Various Models for Test Data', y=1.05, size=19)
box = pyplot.boxplot(results, labels=names, showmeans=True, patch_artist=True)
palette = ['seagreen', 'orange', 'olive', 'purple', 'red', 'yellow', 'lightblue']
for patch, colour in zip(box['boxes'], palette):
    patch.set_facecolor(colour)
pyplot.show()
# ---------------- Random Forest ----------------
# Build and fit a 50-tree random forest (max 12 features per split).
# FIX: redundant duplicate model (`rfcl`) removed — it was fitted and
# never used.
rfc_model = RandomForestClassifier(n_estimators=50, random_state=1, max_features=12)
rfc_model.fit(X_train, y_train)
# predict the response on the held-out test set
rfc_y_predict = rfc_model.predict(X_test)
# evaluate accuracy
rfc_model_score = rfc_model.score(X_test, y_test)
rfc_accuracy = rfc_model_score  # Accuracy of Random Forest Classifier model
rfc_confusion_matrix = metrics.confusion_matrix(y_test, rfc_y_predict)
# train vs test accuracy
print('Accuracy of Training Data : ', rfc_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', rfc_model.score(X_test, y_test))
print('----------------------Final Analysis of Random Forest Classifier----------------------------\n')
print('Random Forest Classifier Model Accuracy Score : %f' % rfc_accuracy)
print('\nRandom Forest Classifier Confusion Matrix : \n', rfc_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', rfc_confusion_matrix[1][1])
print('True Negative = ', rfc_confusion_matrix[0][0])
print('False Possitive = ', rfc_confusion_matrix[0][1])
print('False Negative = ', rfc_confusion_matrix[1][0])
print('\n Random Forest Classifier Model classification Report : \n', metrics.classification_report(y_test, rfc_y_predict))
# Hand-derived accuracy / precision / recall / F1.
rfc_conf_table = rfc_confusion_matrix
a7 = (rfc_conf_table[0, 0] + rfc_conf_table[1, 1]) / (rfc_conf_table[0, 0] + rfc_conf_table[0, 1] + rfc_conf_table[1, 0] + rfc_conf_table[1, 1])
p7 = rfc_conf_table[1, 1] / (rfc_conf_table[1, 1] + rfc_conf_table[0, 1])
r7 = rfc_conf_table[1, 1] / (rfc_conf_table[1, 1] + rfc_conf_table[1, 0])
f7 = (2 * p7 * r7) / (p7 + r7)
print("\nAccuracy : ", round(a7, 2))
print("Precision : ", round(p7, 2))
print("Recall : ", round(r7, 2))
print("F1 score : ", round(f7, 2))
plt.clf()
plt.figure(figsize=(12, 8))
plt.imshow(rfc_confusion_matrix, interpolation='nearest', cmap="GnBu")
# FIX: labels were swapped *and* misspelled ('Healty'); class 0 is
# Healthy, class 1 is Parkinson's.
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix Random Forest Classifier Model")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + " = " + str(rfc_confusion_matrix[i][j]))
plt.show()
# ---------------- Bagging (base learner: entropy decision tree) ----------------
# FIX: the original fitted an identical throwaway model (`bgcl`) first
# and carried a commented-out variant — both removed.
bgcl_model = BaggingClassifier(base_estimator=dt_model, n_estimators=50, random_state=1)
bgcl_model.fit(X_train, y_train)
# predict the response on the held-out test set
bgcl_y_predict = bgcl_model.predict(X_test)
# evaluate accuracy
bgcl_model_score = bgcl_model.score(X_test, y_test)
bgcl_accuracy = bgcl_model_score  # Accuracy of Bagging Classifier model
bgcl_confusion_matrix = metrics.confusion_matrix(y_test, bgcl_y_predict)
# train vs test accuracy
print('Accuracy of Training Data : ', bgcl_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', bgcl_model.score(X_test, y_test))
print('----------------------Final Analysis of Bagging Classifier----------------------------\n')
print('Bagging Classifier Model Accuracy Score : %f' % bgcl_accuracy)
print('\nBagging Classifier Confusion Matrix : \n', bgcl_confusion_matrix)
# rows/cols ordered 0 (healthy), 1 (Parkinson's)
print('\nTrue Possitive = ', bgcl_confusion_matrix[1][1])
print('True Negative = ', bgcl_confusion_matrix[0][0])
print('False Possitive = ', bgcl_confusion_matrix[0][1])
print('False Negative = ', bgcl_confusion_matrix[1][0])
print('\n Bagging Classifier Model classification Report : \n', metrics.classification_report(y_test, bgcl_y_predict))
# Hand-derived accuracy / precision / recall / F1.
bgcl_conf_table = bgcl_confusion_matrix
a8 = (bgcl_conf_table[0, 0] + bgcl_conf_table[1, 1]) / (bgcl_conf_table[0, 0] + bgcl_conf_table[0, 1] + bgcl_conf_table[1, 0] + bgcl_conf_table[1, 1])
p8 = bgcl_conf_table[1, 1] / (bgcl_conf_table[1, 1] + bgcl_conf_table[0, 1])
r8 = bgcl_conf_table[1, 1] / (bgcl_conf_table[1, 1] + bgcl_conf_table[1, 0])
f8 = (2 * p8 * r8) / (p8 + r8)
print("\nAccuracy : ", round(a8, 2))
print("Precision : ", round(p8, 2))
print("Recall : ", round(r8, 2))
print("F1 score : ", round(f8, 2))
plt.clf()
plt.figure(figsize = (12,8))
plt.imshow(bgcl_confusion_matrix, interpolation = 'nearest', cmap = "GnBu")
classNames = ['Parkinsons','Healty']
plt.title("Confusion Matrix Bagging Classifier Model")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN','FP'],['FN','TP']]
for i in range(2):
for j in range(2):
plt.text(j,i,str(s[i][j])+" = "+str(bgcl_confusion_matrix[i][j]))
plt.show()
# ---------- AdaBoost Classifier: build, fit, evaluate ----------
# Build and fit the boosting ensemble ONCE.  The original trained an identical
# duplicate model (`abcl`, same hyper-parameters and random_state) right before
# `abcl_model`; alias the name instead so any later reference still works.
abcl_model = AdaBoostClassifier(n_estimators=10, random_state=1)
abcl_model.fit(X_train, y_train)
abcl = abcl_model
# Predict on the held-out test split and score it.
abcl_y_predict = abcl_model.predict(X_test)
abcl_model_score = abcl_model.score(X_test, y_test)
abcl_accuracy = abcl_model_score  # test-set accuracy of the AdaBoost model
abcl_confusion_matrix = metrics.confusion_matrix(y_test, abcl_y_predict)
# Report accuracy on both splits so over-fitting is easy to spot.
print('Accuracy of Training Data : ', abcl_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', abcl_model.score(X_test, y_test))
print('----------------------Final Analysis of AdaBoosting Classifier----------------------------\n')
print('AdaBoosting Classifier Model Accuracy Score : %f' % abcl_accuracy)
print('\nAdaBoosting Classifier Confusion Matrix : \n', abcl_confusion_matrix)
# sklearn convention: rows = actual class, columns = predicted class, with the
# positive class (1) in the last row/column.  ("Possitive" typos fixed.)
print('\nTrue Positive = ', abcl_confusion_matrix[1][1])
print('True Negative = ', abcl_confusion_matrix[0][0])
print('False Positive = ', abcl_confusion_matrix[0][1])
print('False Negative = ', abcl_confusion_matrix[1][0])
print('\n AdaBoosting Classifier Model classification Report : \n', metrics.classification_report(y_test, abcl_y_predict))
# Re-derive the headline metrics from the confusion matrix; a9/p9/r9/f9 are
# reused by the model-comparison tables further down, so keep these names.
abcl_conf_table = abcl_confusion_matrix
tn, fp, fn, tp = abcl_conf_table.ravel()
a9 = (tn + tp) / (tn + fp + fn + tp)
p9 = tp / (tp + fp)  # precision = TP / (TP + FP)
r9 = tp / (tp + fn)  # recall    = TP / (TP + FN)
f9 = (2 * p9 * r9) / (p9 + r9)
print("\nAccuracy : ", round(a9, 2))
print("Precision : ", round(p9, 2))
print("Recall : ", round(r9, 2))
print("F1 score : ", round(f9, 2))
# Heat-map of the confusion matrix.  With TP taken from cell [1,1] above,
# class 1 is Parkinson's, so the original label order ['Parkinsons','Healty']
# mislabelled both axes (and misspelled "Healthy").  TODO confirm class
# encoding against the dataset's `status` column.
plt.figure(figsize=(12, 8))
plt.imshow(abcl_confusion_matrix, interpolation='nearest', cmap="GnBu")
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix AdaBoosting Classifier Model")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
quadrants = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, quadrants[i][j] + " = " + str(abcl_confusion_matrix[i][j]))
plt.show()
# ---------- Gradient Boosting Classifier: build, fit, evaluate ----------
# Build and fit the boosting ensemble ONCE.  The original trained an identical
# duplicate model (`gbcl`, same hyper-parameters and random_state) right before
# `gbcl_model`; alias the name instead so any later reference still works.
gbcl_model = GradientBoostingClassifier(n_estimators=50, random_state=1)
gbcl_model.fit(X_train, y_train)
gbcl = gbcl_model
# Predict on the held-out test split and score it.
gbcl_y_predict = gbcl_model.predict(X_test)
gbcl_model_score = gbcl_model.score(X_test, y_test)
gbcl_accuracy = gbcl_model_score  # test-set accuracy of the GradientBoost model
gbcl_confusion_matrix = metrics.confusion_matrix(y_test, gbcl_y_predict)
# Report accuracy on both splits so over-fitting is easy to spot.
print('Accuracy of Training Data : ', gbcl_model.score(X_train, y_train))
print('Accuracy of Testing Data : ', gbcl_model.score(X_test, y_test))
print('----------------------Final Analysis of GradientBoost Classifier----------------------------\n')
print('GradientBoost Classifier Model Accuracy Score : %f' % gbcl_accuracy)
print('\nGradientBoost Classifier Confusion Matrix : \n', gbcl_confusion_matrix)
# sklearn convention: rows = actual class, columns = predicted class, with the
# positive class (1) in the last row/column.  ("Possitive" typos fixed.)
print('\nTrue Positive = ', gbcl_confusion_matrix[1][1])
print('True Negative = ', gbcl_confusion_matrix[0][0])
print('False Positive = ', gbcl_confusion_matrix[0][1])
print('False Negative = ', gbcl_confusion_matrix[1][0])
print('\n GradientBoost Classifier Model classification Report : \n', metrics.classification_report(y_test, gbcl_y_predict))
# Re-derive the headline metrics from the confusion matrix; a10/p10/r10/f10
# are reused by the model-comparison tables further down, so keep these names.
gbcl_conf_table = gbcl_confusion_matrix
tn, fp, fn, tp = gbcl_conf_table.ravel()
a10 = (tn + tp) / (tn + fp + fn + tp)
p10 = tp / (tp + fp)  # precision = TP / (TP + FP)
r10 = tp / (tp + fn)  # recall    = TP / (TP + FN)
f10 = (2 * p10 * r10) / (p10 + r10)
print("\nAccuracy : ", round(a10, 2))
print("Precision : ", round(p10, 2))
print("Recall : ", round(r10, 2))
print("F1 score : ", round(f10, 2))
# Heat-map of the confusion matrix.  With TP taken from cell [1,1] above,
# class 1 is Parkinson's, so the original label order ['Parkinsons','Healty']
# mislabelled both axes (and misspelled "Healthy").  TODO confirm class
# encoding against the dataset's `status` column.
plt.figure(figsize=(12, 8))
plt.imshow(gbcl_confusion_matrix, interpolation='nearest', cmap="GnBu")
classNames = ['Healthy', 'Parkinsons']
plt.title("Confusion Matrix GradientBoost Classifier Model")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label")
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
quadrants = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, quadrants[i][j] + " = " + str(gbcl_confusion_matrix[i][j]))
plt.show()
# Comparison table for the tree-based / ensemble models: train & test accuracy
# plus recall and precision, all expressed as percentages.
_ensemble_specs = [
    ('Decision Tree - Entropy', dt_model, r5, p5),
    ('Decision Tree - Gini', dt1_model, r6, p6),
    ('Random Forest Classifier', rfc_model, r7, p7),
    ('Bagging Classifier', bgcl_model, r8, p8),
    ('AdaBoost Classifier', abcl_model, r9, p9),
    ('Gradient Boosting Classifier', gbcl_model, r10, p10),
]
mdllsts = [
    [name,
     mdl.score(X_train, y_train) * 100,
     mdl.score(X_test, y_test) * 100,
     rec * 100,
     prec * 100]
    for name, mdl, rec, prec in _ensemble_specs
]
mdl_e_df = pd.DataFrame(mdllsts, columns = ['Model', 'Accuracy Score of Training Data', 'Accuracy Score of Test Data', 'Recall Score', 'Precision Score'])
mdl_e_df
# Comparison table across ALL ten classifiers: train & test accuracy plus
# recall and precision, all expressed as percentages.
_model_specs = [
    ('Logistic Regression', lrg_model, r1, p1),
    ('Gaussian Naïve Bayes', gnb_model, r2, p2),
    ('K-Nearest Neighbour', knn_model, r3, p3),
    ('Support Vector Classifier', svc_model, r4, p4),
    ('Decision Tree - Entropy', dt_model, r5, p5),
    ('Decision Tree - Gini', dt1_model, r6, p6),
    ('Random Forest Classifier', rfc_model, r7, p7),
    ('Bagging Classifier', bgcl_model, r8, p8),
    ('AdaBoost Classifier', abcl_model, r9, p9),
    ('Gradient Boosting Classifier', gbcl_model, r10, p10),
]
modellsts = [
    [name,
     mdl.score(X_train, y_train) * 100,
     mdl.score(X_test, y_test) * 100,
     rec * 100,
     prec * 100]
    for name, mdl, rec, prec in _model_specs
]
mdl_df = pd.DataFrame(modellsts, columns = ['Model', 'Accuracy Score of Training Data', 'Accuracy Score of Test Data', 'Recall Score', 'Precision Score'])
mdl_df
from sklearn.metrics import roc_curve, roc_auc_score
print('\n\nReceiver Operating Characteristic (ROC) curve to evalute the classifier output quality. If area of curve is closer to 1 which means better the model and if area of curve is closer to 0 which means poor the model.')
# NOTE(review): the curves below are built from hard 0/1 class predictions,
# not probability scores (predict_proba / decision_function), so each "curve"
# is a single operating point and the AUC understates model quality.  Kept
# as-is to preserve the notebook's existing numbers; switch to probability
# scores for true ROC curves.
def _plot_roc_curve(fpr, tpr, label):
    """Draw one ROC curve with the chance diagonal in its own full-size figure.

    The original code opened a fresh figure per model but placed its axes at
    an ever-changing subplot grid slot ((1,3,1), (2,3,1), (4,3,2), ...), so
    each successive plot was smaller and oddly positioned; every curve now
    gets the full figure.  Returns the Figure so callers can keep a handle.
    """
    fig = plt.figure(figsize=(15, 4))
    fig.add_subplot(1, 1, 1)
    plt.plot(fpr, tpr, label=label, color='darkorange')
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC)')
    plt.legend(loc="lower right")
    return fig

# One ROC plot per model.  The *_fpr/*_tpr/*_threshold/*_roc_auc names are
# kept because later cells may reference them.
lrg_fpr, lrg_tpr, lrg_threshold = metrics.roc_curve(y_test, lrg_y_predict)
lrg_roc_auc = metrics.roc_auc_score(y_test, lrg_y_predict)
fig1_graph = _plot_roc_curve(lrg_fpr, lrg_tpr, 'Logistic Regression Model (area = %0.2f)' % lrg_roc_auc)

gnb_fpr, gnb_tpr, gnb_threshold = metrics.roc_curve(y_test, gnb_y_predict)
gnb_roc_auc = metrics.roc_auc_score(y_test, gnb_y_predict)
fig1_graph = _plot_roc_curve(gnb_fpr, gnb_tpr, 'Gaussian Naïve Bayes Model (area = %0.2f)' % gnb_roc_auc)

knn_fpr, knn_tpr, knn_threshold = metrics.roc_curve(y_test, knn_y_predict)
knn_roc_auc = metrics.roc_auc_score(y_test, knn_y_predict)
fig1_graph = _plot_roc_curve(knn_fpr, knn_tpr, 'K-NN Model (area = %0.2f)' % knn_roc_auc)

# NOTE(review): every other plot pairs <name>_y_predict with <name>_model, but
# the comparison table uses `svc_model` — confirm `svm_y_predict` is actually
# defined earlier in the notebook (it may need to be `svc_y_predict`).
svm_fpr, svm_tpr, svm_threshold = metrics.roc_curve(y_test, svm_y_predict)
svm_roc_auc = metrics.roc_auc_score(y_test, svm_y_predict)
fig1_graph = _plot_roc_curve(svm_fpr, svm_tpr, 'Support Vector Machine Model (area = %0.2f)' % svm_roc_auc)

dt_fpr, dt_tpr, dt_threshold = metrics.roc_curve(y_test, dt_y_predict)
dt_roc_auc = metrics.roc_auc_score(y_test, dt_y_predict)
fig1_graph = _plot_roc_curve(dt_fpr, dt_tpr, 'Decision Tree (Entropy) Model (area = %0.2f)' % dt_roc_auc)

dt1_fpr, dt1_tpr, dt1_threshold = metrics.roc_curve(y_test, dt1_y_predict)
dt1_roc_auc = metrics.roc_auc_score(y_test, dt1_y_predict)
fig1_graph = _plot_roc_curve(dt1_fpr, dt1_tpr, 'Decision Tree (Gini) Model (area = %0.2f)' % dt1_roc_auc)

rfc_fpr, rfc_tpr, rfc_threshold = metrics.roc_curve(y_test, rfc_y_predict)
rfc_roc_auc = metrics.roc_auc_score(y_test, rfc_y_predict)
fig1_graph = _plot_roc_curve(rfc_fpr, rfc_tpr, 'Random Forest Classifier Model (area = %0.2f)' % rfc_roc_auc)

bgcl_fpr, bgcl_tpr, bgcl_threshold = metrics.roc_curve(y_test, bgcl_y_predict)
bgcl_roc_auc = metrics.roc_auc_score(y_test, bgcl_y_predict)
fig1_graph = _plot_roc_curve(bgcl_fpr, bgcl_tpr, 'Bagging Classifier Model (area = %0.2f)' % bgcl_roc_auc)

abcl_fpr, abcl_tpr, abcl_threshold = metrics.roc_curve(y_test, abcl_y_predict)
abcl_roc_auc = metrics.roc_auc_score(y_test, abcl_y_predict)
fig1_graph = _plot_roc_curve(abcl_fpr, abcl_tpr, 'AdaBoosting Classifier Model (area = %0.2f)' % abcl_roc_auc)

gbcl_fpr, gbcl_tpr, gbcl_threshold = metrics.roc_curve(y_test, gbcl_y_predict)
gbcl_roc_auc = metrics.roc_auc_score(y_test, gbcl_y_predict)
fig1_graph = _plot_roc_curve(gbcl_fpr, gbcl_tpr, 'Gradient Boosting Classifier Model (area = %0.2f)' % gbcl_roc_auc)
# Bar chart comparing the test-set accuracy of all ten classification models.
plt.figure(figsize=(25, 10))
splot = sns.barplot(x = mdl_df['Model'], y = mdl_df['Accuracy Score of Test Data'], data = mdl_df)
# Write each bar's height (accuracy, one decimal place) just above the bar.
for bar in splot.patches:
    height = bar.get_height()
    splot.annotate(
        format(height, '.1f'),
        (bar.get_x() + bar.get_width() / 2., height),
        ha = 'center', va = 'center',
        size = 25,
        xytext = (0, 9),
        textcoords = 'offset points',
    )
# Rotate the long model names so they do not overlap.
splot.set_xticklabels(splot.get_xmajorticklabels(), fontsize = 20, rotation = 45)
plt.xlabel('Model', fontsize = 35)
plt.ylabel('Accuracy Score', fontsize = 35)
plt.title('\nComparison of Classification Models\n\n', fontsize = 40)